{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017754,
     "end_time": "2020-12-18T18:14:04.126287",
     "exception": false,
     "start_time": "2020-12-18T18:14:04.108533",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# SageMaker Debugger Profiling Report\n",
    "\n",
    "SageMaker Debugger auto generated this report. You can generate similar reports on all supported training jobs. The report provides summary of training job, system resource usage statistics, framework metrics, rules summary, and detailed analysis from each rule. The graphs and tables are interactive. \n",
    "\n",
    "**Legal disclaimer:** This report and any recommendations are provided for informational purposes only and are not definitive. You are responsible for making your own independent assessment of the information.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:04.164704Z",
     "iopub.status.busy": "2020-12-18T18:14:04.164213Z",
     "iopub.status.idle": "2020-12-18T18:14:04.692355Z",
     "shell.execute_reply": "2020-12-18T18:14:04.692745Z"
    },
    "papermill": {
     "duration": 0.549439,
     "end_time": "2020-12-18T18:14:04.693007",
     "exception": false,
     "start_time": "2020-12-18T18:14:04.143568",
     "status": "completed"
    },
    "tags": [
     "hide-output",
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2020-12-18 18:28:16.491 ip-172-16-176-76:28403 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import glob\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import datetime\n",
    "from smdebug.profiler.utils import us_since_epoch_to_human_readable_time, ns_since_epoch_to_human_readable_time\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:04.732037Z",
     "iopub.status.busy": "2020-12-18T18:14:04.731544Z",
     "iopub.status.idle": "2020-12-18T18:14:04.957560Z",
     "shell.execute_reply": "2020-12-18T18:14:04.957136Z"
    },
    "papermill": {
     "duration": 0.247676,
     "end_time": "2020-12-18T18:14:04.957682",
     "exception": false,
     "start_time": "2020-12-18T18:14:04.710006",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "application/javascript": [
       "\n",
       "(function(root) {\n",
       "  function now() {\n",
       "    return new Date();\n",
       "  }\n",
       "\n",
       "  var force = true;\n",
       "\n",
       "  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n",
       "    root._bokeh_onload_callbacks = [];\n",
       "    root._bokeh_is_loading = undefined;\n",
       "  }\n",
       "\n",
       "  var JS_MIME_TYPE = 'application/javascript';\n",
       "  var HTML_MIME_TYPE = 'text/html';\n",
       "  var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n",
       "  var CLASS_NAME = 'output_bokeh rendered_html';\n",
       "\n",
       "  /**\n",
       "   * Render data to the DOM node\n",
       "   */\n",
       "  function render(props, node) {\n",
       "    var script = document.createElement(\"script\");\n",
       "    node.appendChild(script);\n",
       "  }\n",
       "\n",
       "  /**\n",
       "   * Handle when an output is cleared or removed\n",
       "   */\n",
       "  function handleClearOutput(event, handle) {\n",
       "    var cell = handle.cell;\n",
       "\n",
       "    var id = cell.output_area._bokeh_element_id;\n",
       "    var server_id = cell.output_area._bokeh_server_id;\n",
       "    // Clean up Bokeh references\n",
       "    if (id != null && id in Bokeh.index) {\n",
       "      Bokeh.index[id].model.document.clear();\n",
       "      delete Bokeh.index[id];\n",
       "    }\n",
       "\n",
       "    if (server_id !== undefined) {\n",
       "      // Clean up Bokeh references\n",
       "      var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n",
       "      cell.notebook.kernel.execute(cmd, {\n",
       "        iopub: {\n",
       "          output: function(msg) {\n",
       "            var id = msg.content.text.trim();\n",
       "            if (id in Bokeh.index) {\n",
       "              Bokeh.index[id].model.document.clear();\n",
       "              delete Bokeh.index[id];\n",
       "            }\n",
       "          }\n",
       "        }\n",
       "      });\n",
       "      // Destroy server and session\n",
       "      var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n",
       "      cell.notebook.kernel.execute(cmd);\n",
       "    }\n",
       "  }\n",
       "\n",
       "  /**\n",
       "   * Handle when a new output is added\n",
       "   */\n",
       "  function handleAddOutput(event, handle) {\n",
       "    var output_area = handle.output_area;\n",
       "    var output = handle.output;\n",
       "\n",
       "    // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n",
       "    if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n",
       "      return\n",
       "    }\n",
       "\n",
       "    var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n",
       "\n",
       "    if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n",
       "      toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n",
       "      // store reference to embed id on output_area\n",
       "      output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n",
       "    }\n",
       "    if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n",
       "      var bk_div = document.createElement(\"div\");\n",
       "      bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n",
       "      var script_attrs = bk_div.children[0].attributes;\n",
       "      for (var i = 0; i < script_attrs.length; i++) {\n",
       "        toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n",
       "      }\n",
       "      // store reference to server id on output_area\n",
       "      output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n",
       "    }\n",
       "  }\n",
       "\n",
       "  function register_renderer(events, OutputArea) {\n",
       "\n",
       "    function append_mime(data, metadata, element) {\n",
       "      // create a DOM node to render to\n",
       "      var toinsert = this.create_output_subarea(\n",
       "        metadata,\n",
       "        CLASS_NAME,\n",
       "        EXEC_MIME_TYPE\n",
       "      );\n",
       "      this.keyboard_manager.register_events(toinsert);\n",
       "      // Render to node\n",
       "      var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n",
       "      render(props, toinsert[toinsert.length - 1]);\n",
       "      element.append(toinsert);\n",
       "      return toinsert\n",
       "    }\n",
       "\n",
       "    /* Handle when an output is cleared or removed */\n",
       "    events.on('clear_output.CodeCell', handleClearOutput);\n",
       "    events.on('delete.Cell', handleClearOutput);\n",
       "\n",
       "    /* Handle when a new output is added */\n",
       "    events.on('output_added.OutputArea', handleAddOutput);\n",
       "\n",
       "    /**\n",
       "     * Register the mime type and append_mime function with output_area\n",
       "     */\n",
       "    OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n",
       "      /* Is output safe? */\n",
       "      safe: true,\n",
       "      /* Index of renderer in `output_area.display_order` */\n",
       "      index: 0\n",
       "    });\n",
       "  }\n",
       "\n",
       "  // register the mime type if in Jupyter Notebook environment and previously unregistered\n",
       "  if (root.Jupyter !== undefined) {\n",
       "    var events = require('base/js/events');\n",
       "    var OutputArea = require('notebook/js/outputarea').OutputArea;\n",
       "\n",
       "    if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n",
       "      register_renderer(events, OutputArea);\n",
       "    }\n",
       "  }\n",
       "\n",
       "  \n",
       "  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n",
       "    root._bokeh_timeout = Date.now() + 5000;\n",
       "    root._bokeh_failed_load = false;\n",
       "  }\n",
       "\n",
       "  var NB_LOAD_WARNING = {'data': {'text/html':\n",
       "     \"<div style='background-color: #fdd'>\\n\"+\n",
       "     \"<p>\\n\"+\n",
       "     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n",
       "     \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n",
       "     \"</p>\\n\"+\n",
       "     \"<ul>\\n\"+\n",
       "     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n",
       "     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n",
       "     \"</ul>\\n\"+\n",
       "     \"<code>\\n\"+\n",
       "     \"from bokeh.resources import INLINE\\n\"+\n",
       "     \"output_notebook(resources=INLINE)\\n\"+\n",
       "     \"</code>\\n\"+\n",
       "     \"</div>\"}};\n",
       "\n",
       "  function display_loaded() {\n",
       "    var el = document.getElementById(null);\n",
       "    if (el != null) {\n",
       "      el.textContent = \"BokehJS is loading...\";\n",
       "    }\n",
       "    if (root.Bokeh !== undefined) {\n",
       "      if (el != null) {\n",
       "        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n",
       "      }\n",
       "    } else if (Date.now() < root._bokeh_timeout) {\n",
       "      setTimeout(display_loaded, 100)\n",
       "    }\n",
       "  }\n",
       "\n",
       "\n",
       "  function run_callbacks() {\n",
       "    try {\n",
       "      root._bokeh_onload_callbacks.forEach(function(callback) {\n",
       "        if (callback != null)\n",
       "          callback();\n",
       "      });\n",
       "    } finally {\n",
       "      delete root._bokeh_onload_callbacks\n",
       "    }\n",
       "    console.debug(\"Bokeh: all callbacks have finished\");\n",
       "  }\n",
       "\n",
       "  function load_libs(css_urls, js_urls, callback) {\n",
       "    if (css_urls == null) css_urls = [];\n",
       "    if (js_urls == null) js_urls = [];\n",
       "\n",
       "    root._bokeh_onload_callbacks.push(callback);\n",
       "    if (root._bokeh_is_loading > 0) {\n",
       "      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n",
       "      return null;\n",
       "    }\n",
       "    if (js_urls == null || js_urls.length === 0) {\n",
       "      run_callbacks();\n",
       "      return null;\n",
       "    }\n",
       "    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n",
       "    root._bokeh_is_loading = css_urls.length + js_urls.length;\n",
       "\n",
       "    function on_load() {\n",
       "      root._bokeh_is_loading--;\n",
       "      if (root._bokeh_is_loading === 0) {\n",
       "        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n",
       "        run_callbacks()\n",
       "      }\n",
       "    }\n",
       "\n",
       "    function on_error() {\n",
       "      console.error(\"failed to load \" + url);\n",
       "    }\n",
       "\n",
       "    for (var i = 0; i < css_urls.length; i++) {\n",
       "      var url = css_urls[i];\n",
       "      const element = document.createElement(\"link\");\n",
       "      element.onload = on_load;\n",
       "      element.onerror = on_error;\n",
       "      element.rel = \"stylesheet\";\n",
       "      element.type = \"text/css\";\n",
       "      element.href = url;\n",
       "      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n",
       "      document.body.appendChild(element);\n",
       "    }\n",
       "\n",
       "    for (var i = 0; i < js_urls.length; i++) {\n",
       "      var url = js_urls[i];\n",
       "      var element = document.createElement('script');\n",
       "      element.onload = on_load;\n",
       "      element.onerror = on_error;\n",
       "      element.async = false;\n",
       "      element.src = url;\n",
       "      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n",
       "      document.head.appendChild(element);\n",
       "    }\n",
       "  };\n",
       "\n",
       "  function inject_raw_css(css) {\n",
       "    const element = document.createElement(\"style\");\n",
       "    element.appendChild(document.createTextNode(css));\n",
       "    document.body.appendChild(element);\n",
       "  }\n",
       "\n",
       "  \n",
       "  var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n",
       "  var css_urls = [];\n",
       "  \n",
       "\n",
       "  var inline_js = [\n",
       "    function(Bokeh) {\n",
       "      Bokeh.set_log_level(\"info\");\n",
       "    },\n",
       "    function(Bokeh) {\n",
       "    \n",
       "    \n",
       "    }\n",
       "  ];\n",
       "\n",
       "  function run_inline_js() {\n",
       "    \n",
       "    if (root.Bokeh !== undefined || force === true) {\n",
       "      \n",
       "    for (var i = 0; i < inline_js.length; i++) {\n",
       "      inline_js[i].call(root, root.Bokeh);\n",
       "    }\n",
       "    } else if (Date.now() < root._bokeh_timeout) {\n",
       "      setTimeout(run_inline_js, 100);\n",
       "    } else if (!root._bokeh_failed_load) {\n",
       "      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n",
       "      root._bokeh_failed_load = true;\n",
       "    } else if (force !== true) {\n",
       "      var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n",
       "      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n",
       "    }\n",
       "\n",
       "  }\n",
       "\n",
       "  if (root._bokeh_is_loading === 0) {\n",
       "    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n",
       "    run_inline_js();\n",
       "  } else {\n",
       "    load_libs(css_urls, js_urls, function() {\n",
       "      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n",
       "      run_inline_js();\n",
       "    });\n",
       "  }\n",
       "}(window));"
      ],
      "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n  function now() {\n    return new Date();\n  }\n\n  var force = true;\n\n  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n    root._bokeh_onload_callbacks = [];\n    root._bokeh_is_loading = undefined;\n  }\n\n  \n\n  \n  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n    root._bokeh_timeout = Date.now() + 5000;\n    root._bokeh_failed_load = false;\n  }\n\n  var NB_LOAD_WARNING = {'data': {'text/html':\n     \"<div style='background-color: #fdd'>\\n\"+\n     \"<p>\\n\"+\n     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n     \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n     \"</p>\\n\"+\n     \"<ul>\\n\"+\n     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n     \"</ul>\\n\"+\n     \"<code>\\n\"+\n     \"from bokeh.resources import INLINE\\n\"+\n     \"output_notebook(resources=INLINE)\\n\"+\n     \"</code>\\n\"+\n     \"</div>\"}};\n\n  function display_loaded() {\n    var el = document.getElementById(null);\n    if (el != null) {\n      el.textContent = \"BokehJS is loading...\";\n    }\n    if (root.Bokeh !== undefined) {\n      if (el != null) {\n        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n      }\n    } else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(display_loaded, 100)\n    }\n  }\n\n\n  function run_callbacks() {\n    try {\n      root._bokeh_onload_callbacks.forEach(function(callback) {\n        if (callback != null)\n          callback();\n      });\n    } finally {\n      delete root._bokeh_onload_callbacks\n    }\n    console.debug(\"Bokeh: all callbacks have finished\");\n  }\n\n  function load_libs(css_urls, js_urls, callback) {\n    if (css_urls == null) css_urls = [];\n    if (js_urls == null) js_urls = [];\n\n    root._bokeh_onload_callbacks.push(callback);\n    if (root._bokeh_is_loading > 0) {\n      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n      return null;\n    }\n    if (js_urls == null || js_urls.length === 0) {\n      run_callbacks();\n      return null;\n    }\n    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n    root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n    function on_load() {\n      root._bokeh_is_loading--;\n      if (root._bokeh_is_loading === 0) {\n        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n        run_callbacks()\n      }\n    }\n\n    function on_error() {\n      console.error(\"failed to load \" + url);\n    }\n\n    for (var i = 0; i < css_urls.length; i++) {\n      var url = css_urls[i];\n      const element = document.createElement(\"link\");\n      element.onload = on_load;\n      element.onerror = on_error;\n      element.rel = \"stylesheet\";\n      element.type = \"text/css\";\n      element.href = url;\n      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n      document.body.appendChild(element);\n    }\n\n    for (var i = 0; i < js_urls.length; i++) {\n      var url = js_urls[i];\n      var element = document.createElement('script');\n      element.onload = on_load;\n      element.onerror = on_error;\n      element.async = false;\n      element.src = url;\n      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n      document.head.appendChild(element);\n    }\n  };\n\n  function inject_raw_css(css) {\n    const element = document.createElement(\"style\");\n    element.appendChild(document.createTextNode(css));\n    document.body.appendChild(element);\n  }\n\n  \n  var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n  var css_urls = [];\n  \n\n  var inline_js = [\n    function(Bokeh) {\n      Bokeh.set_log_level(\"info\");\n    },\n    function(Bokeh) {\n    \n    \n    }\n  ];\n\n  function run_inline_js() {\n    \n    if (root.Bokeh !== undefined || force === true) {\n      \n    for (var i = 0; i < inline_js.length; i++) {\n      inline_js[i].call(root, root.Bokeh);\n    }\n    } else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(run_inline_js, 100);\n    } else if (!root._bokeh_failed_load) {\n      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n      root._bokeh_failed_load = true;\n    } else if (force !== true) {\n      var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n    }\n\n  }\n\n  if (root._bokeh_is_loading === 0) {\n    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n    run_inline_js();\n  } else {\n    load_libs(css_urls, js_urls, function() {\n      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n      run_inline_js();\n    });\n  }\n}(window));"
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import bokeh\n",
    "from bokeh.io import output_notebook, show\n",
    "from bokeh.layouts import column, row\n",
    "from bokeh.plotting import figure\n",
    "from bokeh.models.widgets import DataTable, DateFormatter, TableColumn\n",
    "from bokeh.models import ColumnDataSource, PreText\n",
    "from math import pi\n",
    "from bokeh.transform import cumsum\n",
    "import warnings\n",
    "from bokeh.models.widgets import Paragraph\n",
    "from bokeh.models import Legend\n",
    "from bokeh.util.warnings import BokehDeprecationWarning, BokehUserWarning\n",
    "warnings.simplefilter('ignore', BokehDeprecationWarning)\n",
    "warnings.simplefilter('ignore', BokehUserWarning)\n",
    "\n",
    "output_notebook(hide_banner=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.000078Z",
     "iopub.status.busy": "2020-12-18T18:14:04.999573Z",
     "iopub.status.idle": "2020-12-18T18:14:05.001198Z",
     "shell.execute_reply": "2020-12-18T18:14:05.001570Z"
    },
    "papermill": {
     "duration": 0.026309,
     "end_time": "2020-12-18T18:14:05.001709",
     "exception": false,
     "start_time": "2020-12-18T18:14:04.975400",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "def create_piechart(data_dict, title=None, height=400, width=400, x1=0, x2=0.1, radius=0.4, toolbar_location='right'):\n",
    "   \n",
    "    plot = figure(plot_height=height, \n",
    "                  plot_width=width,\n",
    "                  toolbar_location=toolbar_location,\n",
    "                  tools=\"hover,wheel_zoom,reset,pan\", \n",
    "                  tooltips=\"@phase:@value\", \n",
    "                  title=title,\n",
    "                  x_range=(-radius-x1, radius+x2))\n",
    "\n",
    "    data = pd.Series(data_dict).reset_index(name='value').rename(columns={'index':'phase'})\n",
    "    data['angle'] = data['value']/data['value'].sum() * 2*pi\n",
    "    data['color'] = bokeh.palettes.viridis(len(data_dict))\n",
    "\n",
    "    plot.wedge(x=0, y=0., radius=radius,\n",
    "        start_angle=cumsum('angle', include_zero=True), \n",
    "        end_angle=cumsum('angle'),\n",
    "        line_color=\"white\", \n",
    "        source=data, \n",
    "        fill_color='color', \n",
    "        legend='phase'\n",
    "              )\n",
    "    plot.legend.label_text_font_size = \"8pt\"\n",
    "    plot.legend.location = 'center_right'\n",
    "    plot.axis.axis_label=None\n",
    "    plot.axis.visible=False\n",
    "    plot.grid.grid_line_color = None\n",
    "    plot.outline_line_color = \"white\"\n",
    "    \n",
    "    return plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.040629Z",
     "iopub.status.busy": "2020-12-18T18:14:05.040160Z",
     "iopub.status.idle": "2020-12-18T18:14:05.042282Z",
     "shell.execute_reply": "2020-12-18T18:14:05.041863Z"
    },
    "papermill": {
     "duration": 0.023185,
     "end_time": "2020-12-18T18:14:05.042390",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.019205",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "from IPython.display import display, HTML, Markdown, Image\n",
    "def pretty_print(df):\n",
    "    raw_html = df.to_html().replace(\"\\\\n\",\"<br>\").replace('<tr>','<tr style=\"text-align: left;\">')\n",
    "    return display(HTML(raw_html))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017373,
     "end_time": "2020-12-18T18:14:05.077409",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.060036",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Training job summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.117003Z",
     "iopub.status.busy": "2020-12-18T18:14:05.116454Z",
     "iopub.status.idle": "2020-12-18T18:14:05.118518Z",
     "shell.execute_reply": "2020-12-18T18:14:05.118121Z"
    },
    "papermill": {
     "duration": 0.023167,
     "end_time": "2020-12-18T18:14:05.118622",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.095455",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "def load_report(rule_name):\n",
    "    try:\n",
    "        report = json.load(open('/opt/ml/processing/output/rule/profiler-output/profiler-reports/'+rule_name+'.json'))\n",
    "        return report\n",
    "    except FileNotFoundError:\n",
    "        print (rule_name + ' not triggered')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.164903Z",
     "iopub.status.busy": "2020-12-18T18:14:05.163304Z",
     "iopub.status.idle": "2020-12-18T18:14:05.166649Z",
     "shell.execute_reply": "2020-12-18T18:14:05.167039Z"
    },
    "papermill": {
     "duration": 0.031054,
     "end_time": "2020-12-18T18:14:05.167179",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.136125",
     "status": "completed"
    },
    "tags": [
     "hide-input",
     "hide-output"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MaxInitializationTime not triggered\n"
     ]
    }
   ],
   "source": [
    "\n",
    "job_statistics = {}\n",
    "report = load_report('MaxInitializationTime')\n",
    "if report:\n",
    "    if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n",
    "        first_step = report['Details'][\"step_num\"][\"first\"]\n",
    "        last_step = report['Details'][\"step_num\"][\"last\"]\n",
    "    tmp = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)\n",
    "    date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "    day = date.date().strftime(\"%m/%d/%Y\")\n",
    "    hour = date.time().strftime(\"%H:%M:%S\")\n",
    "    job_statistics[\"Start time\"] = f\"{hour} {day}\"\n",
    "    tmp = us_since_epoch_to_human_readable_time(report['Details']['job_end'] * 1000000)\n",
    "    date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "    day = date.date().strftime(\"%m/%d/%Y\")\n",
    "    hour = date.time().strftime(\"%H:%M:%S\")\n",
    "    job_statistics[\"End time\"] = f\"{hour} {day}\"\n",
    "    job_duration_in_seconds = int(report['Details']['job_end'] - report['Details']['job_start']) \n",
    "    job_statistics[\"Job duration\"] = f\"{job_duration_in_seconds} seconds\"\n",
    "    if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n",
    "        tmp = us_since_epoch_to_human_readable_time(first_step)\n",
    "        date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "        day = date.date().strftime(\"%m/%d/%Y\")\n",
    "        hour = date.time().strftime(\"%H:%M:%S\")\n",
    "        job_statistics[\"Training loop start\"] = f\"{hour} {day}\"\n",
    "        tmp = us_since_epoch_to_human_readable_time(last_step)\n",
    "        date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "        day = date.date().strftime(\"%m/%d/%Y\")\n",
    "        hour = date.time().strftime(\"%H:%M:%S\")\n",
    "        job_statistics[\"Training loop end\"] = f\"{hour} {day}\"\n",
    "        training_loop_duration_in_seconds = int((last_step - first_step) / 1000000)\n",
    "        job_statistics[\"Training loop duration\"] = f\"{training_loop_duration_in_seconds} seconds\"\n",
    "        initialization_in_seconds = int(first_step/1000000 - report['Details']['job_start'])\n",
    "        job_statistics[\"Initialization time\"] = f\"{initialization_in_seconds} seconds\"\n",
    "        finalization_in_seconds = int(np.abs(report['Details']['job_end'] - last_step/1000000))\n",
    "        job_statistics[\"Finalization time\"] = f\"{finalization_in_seconds} seconds\"\n",
    "        initialization_perc = int(initialization_in_seconds / job_duration_in_seconds * 100)\n",
    "        job_statistics[\"Initialization\"] = f\"{initialization_perc} %\"\n",
    "        training_loop_perc = int(training_loop_duration_in_seconds / job_duration_in_seconds * 100)\n",
    "        job_statistics[\"Training loop\"] = f\"{training_loop_perc} %\"\n",
    "        finalization_perc = int(finalization_in_seconds / job_duration_in_seconds * 100)\n",
    "        job_statistics[\"Finalization\"] = f\"{finalization_perc} %\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.211542Z",
     "iopub.status.busy": "2020-12-18T18:14:05.210210Z",
     "iopub.status.idle": "2020-12-18T18:14:05.248389Z",
     "shell.execute_reply": "2020-12-18T18:14:05.248761Z"
    },
    "papermill": {
     "duration": 0.063864,
     "end_time": "2020-12-18T18:14:05.248906",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.185042",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "if report:\n",
    "    text =  \"\"\"The following table gives a summary about the training job. The table includes information about when the training job started and ended, how much time initialization, training loop and finalization took.\"\"\"\n",
    "    if len(job_statistics) > 0:\n",
    "        df = pd.DataFrame.from_dict(job_statistics, orient='index')\n",
    "        start_time = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)\n",
    "        date = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "        day = date.date().strftime(\"%m/%d/%Y\")\n",
    "        hour = date.time().strftime(\"%H:%M:%S\")\n",
    "        duration = job_duration_in_seconds\n",
    "        text = f\"\"\"{text} \\n Your training job started on {day} at {hour} and ran for {duration} seconds.\"\"\"\n",
    "\n",
    "        #pretty_print(df)\n",
    "        if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n",
    "            if finalization_perc  < 0:\n",
    "                job_statistics[\"Finalization%\"]  = 0\n",
    "            if training_loop_perc < 0:\n",
    "                job_statistics[\"Training loop\"] = 0\n",
    "            if initialization_perc < 0:\n",
    "                job_statistics[\"Initialization\"] = 0\n",
    "        else:\n",
    "            text = f\"\"\"{text} \\n Your training job started on {day} at {hour} and ran for {duration} seconds.\"\"\"\n",
    "            \n",
    "    if len(job_statistics) > 0:\n",
    "        df2 = df.reset_index()\n",
    "        df2.columns = [\"0\", \"1\"]\n",
    "        source = ColumnDataSource(data=df2)\n",
    "        columns = [TableColumn(field='0', title=\"\"),\n",
    "                   TableColumn(field='1', title=\"Job Statistics\"),]\n",
    "        table = DataTable(source=source, columns=columns, width=450, height=380)\n",
    "\n",
    "    plot = None\n",
    "\n",
    "    if \"Initialization\" in job_statistics:\n",
    "        piechart_data = {}\n",
    "        piechart_data[\"Initialization\"] = initialization_perc  \n",
    "        piechart_data[\"Training loop\"]  = training_loop_perc\n",
    "        piechart_data[\"Finalization\"]  = finalization_perc \n",
    "\n",
    "        plot = create_piechart(piechart_data, \n",
    "                               height=350,\n",
    "                               width=500,\n",
    "                               x1=0.15,\n",
    "                               x2=0.15,\n",
    "                               radius=0.15, \n",
    "                               toolbar_location=None)\n",
    "\n",
    "    if plot != None:\n",
    "        paragraph = Paragraph(text=f\"\"\"{text}\"\"\", width = 800)\n",
    "        show(column(paragraph, row(table, plot)))\n",
    "    else:\n",
    "        paragraph = Paragraph(text=f\"\"\"{text}. No step information was profiled from your training job. The time spent on initialization and finalization cannot be computed.\"\"\" , width = 800)\n",
    "        show(column(paragraph, row(table)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.018415,
     "end_time": "2020-12-18T18:14:05.285984",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.267569",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## System usage statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.326563Z",
     "iopub.status.busy": "2020-12-18T18:14:05.326098Z",
     "iopub.status.idle": "2020-12-18T18:14:05.328587Z",
     "shell.execute_reply": "2020-12-18T18:14:05.328133Z"
    },
    "papermill": {
     "duration": 0.024197,
     "end_time": "2020-12-18T18:14:05.328699",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.304502",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OverallSystemUsage not triggered\n"
     ]
    }
   ],
   "source": [
    "report = load_report('OverallSystemUsage')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.374380Z",
     "iopub.status.busy": "2020-12-18T18:14:05.373870Z",
     "iopub.status.idle": "2020-12-18T18:14:05.375969Z",
     "shell.execute_reply": "2020-12-18T18:14:05.375542Z"
    },
    "papermill": {
     "duration": 0.028576,
     "end_time": "2020-12-18T18:14:05.376086",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.347510",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "text1 = ''\n",
    "if report:\n",
    "    if \"GPU\" in report[\"Details\"]:\n",
    "        for node_id in report[\"Details\"][\"GPU\"]:\n",
    "            gpu_p95 = report[\"Details\"][\"GPU\"][node_id][\"p95\"]\n",
    "            gpu_p50 = report[\"Details\"][\"GPU\"][node_id][\"p50\"]\n",
    "            cpu_p95 = report[\"Details\"][\"CPU\"][node_id][\"p95\"]\n",
    "            cpu_p50 = report[\"Details\"][\"CPU\"][node_id][\"p50\"]\n",
    "            \n",
    "            if gpu_p95 < 70 and cpu_p95 < 70:\n",
    "                text1 = f\"\"\"{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. \n",
    "                The 95th percentile of the total CPU utilization is only {int(cpu_p95)}%. Node {node_id} is underutilized. \n",
    "                You may want to consider switching to a smaller instance type.\"\"\"\n",
    "            elif gpu_p95 < 70 and cpu_p95 > 70:\n",
    "                text1 = f\"\"\"{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. \n",
    "                However, the 95th percentile of the total CPU utilization is {int(cpu_p95)}%. GPUs on node {node_id} are underutilized \n",
    "                likely because of CPU bottlenecks\"\"\"\n",
    "            elif gpu_p50 > 70:\n",
    "                text1 = f\"\"\"{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. \n",
    "                GPUs on node {node_id} are well utilized\"\"\"\n",
    "            else:\n",
    "                text1 = f\"\"\"{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. \n",
    "                The median total CPU utilization is {int(cpu_p50)}%.\"\"\"\n",
    "    else:\n",
    "        for node_id in report[\"Details\"][\"CPU\"]:\n",
    "            cpu_p95 = report[\"Details\"][\"CPU\"][node_id][\"p95\"]\n",
    "            if cpu_p95 > 70:\n",
    "                text1 = f\"\"\"{text1}The 95th percentile of the total CPU utilization on node {node_id} is {int**(cpu_p95)}%. GPUs on node {node_id} are well utilized\"\"\"\n",
    "    text1 = Paragraph(text=f\"\"\"{text1}\"\"\", width=1100)\n",
    "    text2 = Paragraph(text=f\"\"\"The following table shows statistics of resource utilization per worker (node), \n",
    "    such as the total CPU and GPU utilization, and the memory utilization on CPU and GPU. \n",
    "    The table also includes the total I/O wait time and the total amount of data sent or received in bytes.\n",
    "    The table shows min and max values as well as p99, p90 and p50 percentiles.\"\"\", width=900)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.440506Z",
     "iopub.status.busy": "2020-12-18T18:14:05.439947Z",
     "iopub.status.idle": "2020-12-18T18:14:05.443780Z",
     "shell.execute_reply": "2020-12-18T18:14:05.443301Z"
    },
    "papermill": {
     "duration": 0.048731,
     "end_time": "2020-12-18T18:14:05.443895",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.395164",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
    "rows = [] \n",
    "units = {\"CPU\": \"percentage\", \"CPU memory\": \"percentage\", \"GPU\": \"percentage\", \"Network\": \"bytes\", \"GPU memory\": \"percentage\", \"I/O\": \"percentage\"}\n",
    "if report:\n",
    "    for metric in report['Details']:\n",
    "        for node_id in report['Details'][metric]:\n",
    "            values = report['Details'][metric][node_id]\n",
    "            rows.append([node_id, metric, units[metric], values['max'], values['p99'], values['p95'], values['p50'], values['min']])\n",
    "\n",
    "    df = pd.DataFrame(rows) \n",
    "    df.columns = ['Node', 'metric', 'unit', 'max', 'p99', 'p95', 'p50', 'min']\n",
    "    df2 = df.reset_index()\n",
    "    source = ColumnDataSource(data=df2)\n",
    "    columns = [TableColumn(field='Node', title=\"node\"),\n",
    "               TableColumn(field='metric', title=\"metric\"),\n",
    "               TableColumn(field='unit', title=\"unit\"),\n",
    "               TableColumn(field='max', title=\"max\"),\n",
    "               TableColumn(field='p99', title=\"p99\"),\n",
    "               TableColumn(field='p95', title=\"p95\"),\n",
    "               TableColumn(field='p50', title=\"p50\"),\n",
    "               TableColumn(field='min', title=\"min\"),]\n",
    "    table = DataTable(source=source, columns=columns, width=800, height=df2.shape[0]*30)\n",
    "\n",
    "    show(column( text1, text2, row(table)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.496294Z",
     "iopub.status.busy": "2020-12-18T18:14:05.494704Z",
     "iopub.status.idle": "2020-12-18T18:14:05.558384Z",
     "shell.execute_reply": "2020-12-18T18:14:05.558758Z"
    },
    "papermill": {
     "duration": 0.095286,
     "end_time": "2020-12-18T18:14:05.558918",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.463632",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OverallFrameworkMetrics not triggered\n"
     ]
    }
   ],
   "source": [
    "report = load_report('OverallFrameworkMetrics')\n",
    "if report:\n",
    "    if 'Details' in report:\n",
    "\n",
    "        display(Markdown(f\"\"\"## Framework metrics summary\"\"\"))\n",
    "        plots = []\n",
    "        text = ''\n",
    "        if 'phase' in report['Details']:\n",
    "            text = f\"\"\"The following two pie charts show the time spent on the TRAIN phase, the EVAL phase, \n",
    "            and others. The 'others' includes the time spent between steps (after one step has finished and before\n",
    "            the next step has started). Ideally, most of the training time should be spent on the \n",
    "            TRAIN and EVAL phases. If TRAIN/EVAL were not specified in the training script, steps will be recorded as \n",
    "            GLOBAL.\"\"\"\n",
    "\n",
    "            if 'others' in report['Details']['phase']:\n",
    "                others = float(report['Details']['phase']['others'])\n",
    "\n",
    "                if others > 25:\n",
    "                    text = f\"\"\"{text} Your training job spent quite a significant amount of time ({round(others,2)}%) in phase \"others\".\n",
    "                    You should check what is happening in between the steps.\"\"\"\n",
    "\n",
    "                plot = create_piechart(report['Details']['phase'], \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"The ratio between the time spent on the TRAIN/EVAL phase and others\")\n",
    "                plots.append(plot)\n",
    "\n",
    "        if 'forward_backward' in report['Details']:\n",
    "\n",
    "            event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n",
    "            perc = report['Details']['forward_backward'][event]\n",
    "\n",
    "            text = f\"\"\"{text} The pie chart on the right shows a more detailed breakdown. \n",
    "            It shows that {int(perc)}% of the time was spent in event \"{event}\".\"\"\"\n",
    "\n",
    "            if perc > 70:\n",
    "                text = f\"\"\"There is quite a significant difference between the time spent on forward and backward\n",
    "                pass.\"\"\"\n",
    "            else:\n",
    "                text = f\"\"\"{text} It shows that {int(perc)}% of the training time\n",
    "                was spent on \"{event}\".\"\"\"\n",
    "\n",
    "            plot = create_piechart(report['Details']['forward_backward'], \n",
    "                                height=350,\n",
    "                                width=600,\n",
    "                                x1=0.2,\n",
    "                                x2=0.6,\n",
    "                                radius=0.3, \n",
    "                                title=\"The ratio between forward and backward pass\") \n",
    "            plots.append(plot)\n",
    "\n",
    "        if len(plots) > 0:\n",
    "            paragraph = Paragraph(text=text, width=1100)\n",
    "            show(column(paragraph, row(plots)))\n",
    "\n",
    "        plots = []\n",
    "        text=''\n",
    "        if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n",
    "\n",
    "            key = list(report['Details']['ratio'].keys())[0]\n",
    "            ratio = report['Details']['ratio'][key]\n",
    "\n",
    "            text = f\"\"\"The following piechart shows a breakdown of the CPU/GPU operators. \n",
    "                It shows that {int(ratio)}% of training time was spent on executing the \"{key}\" operator.\"\"\"\n",
    "\n",
    "            plot = create_piechart(report['Details']['ratio'], \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"The ratio between the time spent on CPU/GPU operators\")\n",
    "            plots.append(plot)\n",
    "\n",
    "\n",
    "        if 'general' in report['Details']:\n",
    "            event = max(report['Details']['general'], key=report['Details']['general'].get)\n",
    "            perc = report['Details']['general'][event]\n",
    "\n",
    "            plot = create_piechart(report['Details']['general'], \n",
    "                                height=350,\n",
    "                                width=600,\n",
    "                                x1=0.2,\n",
    "                                x2=0.6,\n",
    "                                radius=0.3, \n",
    "                                title=\"General framework operations\")\n",
    "            plots.append(plot)\n",
    "\n",
    "        if len(plots) > 0:\n",
    "            paragraph = Paragraph(text=text, width=1100)\n",
    "            show(column(paragraph, row(plots)))\n",
    "\n",
    "        plots = []\n",
    "        text = ''\n",
    "        if 'horovod' in report['Details']:\n",
    "            display(Markdown(f\"\"\"#### Overview: Horovod metrics\"\"\"))\n",
    "            event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n",
    "            perc = report['Details']['horovod'][event]\n",
    "            text = f\"\"\"{text} The following pie chart shows a detailed breakdown of the Horovod metrics profiled\n",
    "            from your training job. The most expensive function was \"{event}\" with {int(perc)}%.\"\"\"\n",
    "\n",
    "            plot = create_piechart(report['Details']['horovod'], \n",
    "                                height=350,\n",
    "                                width=600,\n",
    "                                x1=0.2,\n",
    "                                x2=0.6,\n",
    "                                radius=0.3, \n",
    "                                title=\"Horovod metrics \")\n",
    "\n",
    "            paragraph = Paragraph(text=text, width=1100)\n",
    "            show(column(paragraph, row(plot)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.610922Z",
     "iopub.status.busy": "2020-12-18T18:14:05.610374Z",
     "iopub.status.idle": "2020-12-18T18:14:05.651647Z",
     "shell.execute_reply": "2020-12-18T18:14:05.652021Z"
    },
    "papermill": {
     "duration": 0.071537,
     "end_time": "2020-12-18T18:14:05.652179",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.580642",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
    "rows = [] \n",
    "values = []\n",
    "if report:\n",
    "    if 'CPU_total' in report['Details']:\n",
    "        display(Markdown(f\"\"\"#### Overview: CPU operators\"\"\"))\n",
    "        event = max(report['Details']['CPU'], key=report['Details']['CPU'].get)\n",
    "        perc = report['Details']['CPU'][event]\n",
    "\n",
    "        for function in report['Details']['CPU']:\n",
    "            percentage = round(report['Details']['CPU'][function],2)\n",
    "            time = report['Details']['CPU_total'][function]               \n",
    "            rows.append([percentage, time, function])\n",
    "\n",
    "        df = pd.DataFrame(rows) \n",
    "        df.columns = ['percentage', 'time', 'operator']\n",
    "\n",
    "        df = df.sort_values(by=['percentage'], ascending=False)\n",
    "        source = ColumnDataSource(data=df)\n",
    "        columns = [TableColumn(field='percentage', title=\"Percentage\"),\n",
    "                   TableColumn(field='time', title=\"Cumulative time in microseconds\"),\n",
    "                  TableColumn(field='operator', title=\"CPU operator\"),]\n",
    "\n",
    "        table = DataTable(source=source, columns=columns, width=550, height=350)\n",
    "\n",
    "        text = Paragraph(text=f\"\"\"The following table shows a list of operators that ran on the CPUs.\n",
    "        The most expensive operator on the CPUs was \"{event}\" with {int(perc)} %.\"\"\")\n",
    "\n",
    "        plot = create_piechart(report['Details']['CPU'],\n",
    "                                height=350,\n",
    "                                width=600,\n",
    "                                x1=0.2,\n",
    "                                x2=0.6,\n",
    "                                radius=0.3, \n",
    "                               )\n",
    "\n",
    "        show(column(text, row(table, plot)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.705843Z",
     "iopub.status.busy": "2020-12-18T18:14:05.705332Z",
     "iopub.status.idle": "2020-12-18T18:14:05.747861Z",
     "shell.execute_reply": "2020-12-18T18:14:05.748260Z"
    },
    "papermill": {
     "duration": 0.073916,
     "end_time": "2020-12-18T18:14:05.748410",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.674494",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
    "rows = [] \n",
    "values = []\n",
    "if report:\n",
    "    if 'GPU_total' in report['Details']:\n",
    "        display(Markdown(f\"\"\"#### Overview: GPU operators\"\"\"))\n",
    "        event = max(report['Details']['GPU'], key=report['Details']['GPU'].get)\n",
    "        perc = report['Details']['GPU'][event]\n",
    "\n",
    "        for function in report['Details']['GPU']:\n",
    "            percentage = round(report['Details']['GPU'][function],2)\n",
    "            time = report['Details']['GPU_total'][function]               \n",
    "            rows.append([percentage, time, function])\n",
    "\n",
    "        df = pd.DataFrame(rows) \n",
    "        df.columns = ['percentage', 'time', 'operator']\n",
    "\n",
    "        df = df.sort_values(by=['percentage'], ascending=False)\n",
    "        source = ColumnDataSource(data=df)\n",
    "        columns = [TableColumn(field='percentage', title=\"Percentage\"),\n",
    "                   TableColumn(field='time', title=\"Cumulative time in microseconds\"),\n",
    "                  TableColumn(field='operator', title=\"GPU operator\"),]\n",
    "        table = DataTable(source=source, columns=columns, width=450, height=350)\n",
    "\n",
    "        text = Paragraph(text=f\"\"\"The following table shows a list of operators that your training job ran on GPU.\n",
    "        The most expensive operator on GPU was \"{event}\" with {int(perc)} %\"\"\")\n",
    "\n",
    "        plot = create_piechart(report['Details']['GPU'],\n",
    "                                height=350,\n",
    "                                width=600,\n",
    "                                x1=0.2,\n",
    "                                x2=0.6,\n",
    "                                radius=0.3, \n",
    "                               )\n",
    "\n",
    "        show(column(text, row(table, plot)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.023376,
     "end_time": "2020-12-18T18:14:05.795507",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.772131",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Rules summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.848736Z",
     "iopub.status.busy": "2020-12-18T18:14:05.848216Z",
     "iopub.status.idle": "2020-12-18T18:14:05.850314Z",
     "shell.execute_reply": "2020-12-18T18:14:05.849916Z"
    },
    "papermill": {
     "duration": 0.031502,
     "end_time": "2020-12-18T18:14:05.850424",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.818922",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "description = {}\n",
    "description['CPUBottleneck'] = 'Checks if the CPU utilization is high and the GPU utilization is low. \\\n",
    "It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive \\\n",
    "from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue \\\n",
    "if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.'\n",
    "description['IOBottleneck'] =  'Checks if the data I/O wait time is high and the GPU utilization is low. \\\n",
    "It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. \\\n",
    "The rule evaluates the I/O and GPU utilization rates and triggers the issue \\\n",
    "if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.'\n",
    "description['Dataloader'] = 'Checks how many data loaders are running in parallel and whether the total number is equal the number \\\n",
    "of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. \\\n",
    "If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.'\n",
    "description['GPUMemoryIncrease'] = 'Measures the average GPU memory footprint and triggers if there is a large increase.'\n",
    "description['BatchSize'] = 'Checks if GPUs are underutilized because the batch size is too small. \\\n",
    "To detect this problem, the rule analyzes the average GPU memory footprint, \\\n",
    "the CPU and the GPU utilization. '\n",
    "description['LowGPUUtilization'] = 'Checks if the GPU utilization is low or fluctuating. \\\n",
    "This can happen due to bottlenecks, blocking calls for synchronizations, \\\n",
    "or a small batch size.'\n",
    "description['MaxInitializationTime'] = 'Checks if the time spent on initialization exceeds a threshold percent of the total training time. \\\n",
    "The rule waits until the first step of training loop starts. The initialization can take longer \\\n",
    "if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.'\n",
    "description['LoadBalancing'] = 'Detects workload balancing issues across GPUs. \\\n",
    "Workload imbalance can occur in training jobs with data parallelism. \\\n",
    "The gradients are accumulated on a primary GPU, and this GPU might be overused \\\n",
    "with regard to other GPUs, resulting in reducing the efficiency of data parallelization.'\n",
    "description['StepOutlier'] = 'Detects outliers in step duration. The step duration for forward and backward pass should be \\\n",
    "roughly the same throughout the training. If there are significant outliers, \\\n",
    "it may indicate a system stall or bottleneck issues.'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.902248Z",
     "iopub.status.busy": "2020-12-18T18:14:05.901727Z",
     "iopub.status.idle": "2020-12-18T18:14:05.903451Z",
     "shell.execute_reply": "2020-12-18T18:14:05.903820Z"
    },
    "papermill": {
     "duration": 0.029763,
     "end_time": "2020-12-18T18:14:05.903959",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.874196",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "recommendation = {}\n",
    "recommendation['CPUBottleneck'] = 'Consider increasing the number of data loaders \\\n",
    "or applying data pre-fetching.'\n",
    "recommendation['IOBottleneck'] = 'Pre-fetch data or choose different file formats, such as binary formats that \\\n",
    "improve I/O performance.'\n",
    "recommendation['Dataloader'] = 'Change the number of data loader processes.'\n",
    "recommendation['GPUMemoryIncrease'] = 'Choose a larger instance type with more memory if footprint is close to maximum available memory.'\n",
    "recommendation['BatchSize'] = 'The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.'\n",
    "recommendation['LowGPUUtilization'] = 'Check if there are bottlenecks, minimize blocking calls, \\\n",
    "change distributed training strategy, or increase the batch size.'\n",
    "recommendation['MaxInitializationTime'] = 'Initialization takes too long. \\\n",
    "If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.'\n",
    "recommendation['LoadBalancing'] = 'Choose a different distributed training strategy or \\\n",
    "a different distributed training framework.'\n",
    "recommendation['StepOutlier'] = 'Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:05.957048Z",
     "iopub.status.busy": "2020-12-18T18:14:05.956532Z",
     "iopub.status.idle": "2020-12-18T18:14:05.967392Z",
     "shell.execute_reply": "2020-12-18T18:14:05.967763Z"
    },
    "papermill": {
     "duration": 0.040483,
     "end_time": "2020-12-18T18:14:05.967898",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.927415",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "files = glob.glob('/opt/ml/processing/output/rule/profiler-output/profiler-reports/*json')\n",
    "summary = {}\n",
    "for i in files:\n",
    "    rule_name = i.split('/')[-1].replace('.json','')\n",
    "    if rule_name == \"OverallSystemUsage\" or rule_name == \"OverallFrameworkMetrics\":\n",
    "        continue\n",
    "    rule_report = json.load(open(i))\n",
    "    summary[rule_name] = {}\n",
    "    summary[rule_name]['Description'] = description[rule_name]\n",
    "    summary[rule_name]['Recommendation'] = recommendation[rule_name]\n",
    "#    summary[rule_name]['Number of times rule triggered'] = rule_report['RuleTriggered'] \n",
    "    #summary[rule_name]['Number of violations'] = rule_report['Violations'] \n",
    "    summary[rule_name]['Number of datapoints'] = rule_report['Datapoints']\n",
    "    summary[rule_name]['Rule parameters'] = rule_report['RuleParameters']\n",
    "\n",
    "df = pd.DataFrame.from_dict(summary, orient='index')\n",
    "#df = df.sort_values(by=['Number of times rule triggered'], ascending=False)\n",
    "\n",
    "\n",
    "#(Markdown(f\"\"\"The following table shows a profiling summary of the Debugger built-in rules. \n",
    "#The table is sorted by the rules that triggered the most frequently. During your training job, the {df.index[0]} rule\n",
    "#was the most frequently triggered. It processed {df.values[0,3]} datapoints and was triggered {df.values[0,2]} times.\"\"\"))\n",
    "\n",
    "with pd.option_context('display.colheader_justify','left'):    \n",
    "    pretty_print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.021661Z",
     "iopub.status.busy": "2020-12-18T18:14:06.021177Z",
     "iopub.status.idle": "2020-12-18T18:14:06.023963Z",
     "shell.execute_reply": "2020-12-18T18:14:06.023497Z"
    },
    "papermill": {
     "duration": 0.031825,
     "end_time": "2020-12-18T18:14:06.024070",
     "exception": false,
     "start_time": "2020-12-18T18:14:05.992245",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "## Analyzing the training loop\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "analyse_phase = \"training\"\n",
    "if job_statistics and \"initialization_in_seconds\" in job_statistics:\n",
    "    if job_statistics[\"initialization_in_seconds\"] > job_statistics[\"training_loop_duration_in_seconds\"]:\n",
    "        analyse_phase = \"initialization\"\n",
    "        time = job_statistics[\"initialization_in_seconds\"]\n",
    "        perc = job_statistics[\"initialization_%\"]\n",
    "        display(Markdown(f\"\"\"The initialization phase took {int(time)} seconds, which is {int(perc)}%*\n",
    "        of the total training time. Since the training loop has taken the most time, \n",
    "        we dive deep into the events occurring during this phase\"\"\"))\n",
    "        display(Markdown(\"\"\"## Analyzing initialization\\n\\n\"\"\"))\n",
    "    time = job_statistics[\"training_loop_duration_in_seconds\"]\n",
    "    perc = job_statistics[\"training_loop_%\"]\n",
    "    display(Markdown(f\"\"\"The training loop lasted for {int(time)} seconds which is {int(perc)}% of the training job time.\n",
    "                    Since the training loop has taken the most time, we dive deep into the events occured during this phase.\"\"\"))\n",
    "if analyse_phase == 'training':\n",
    "    display(Markdown(\"\"\"## Analyzing the training loop\\n\\n\"\"\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.076518Z",
     "iopub.status.busy": "2020-12-18T18:14:06.076048Z",
     "iopub.status.idle": "2020-12-18T18:14:06.077739Z",
     "shell.execute_reply": "2020-12-18T18:14:06.078098Z"
    },
    "papermill": {
     "duration": 0.029795,
     "end_time": "2020-12-18T18:14:06.078223",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.048428",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "if analyse_phase == \"initialization\":\n",
    "    display(Markdown(\"\"\"### MaxInitializationTime\\n\\nThis rule helps to detect if the training initialization is taking too much time. \\nThe rule waits until first step is available. The rule takes the parameter `threshold` that defines how many minutes to wait for the first step to become available. Default is 20 minutes.\\nYou can run the rule locally in the following way:\n",
    "    \"\"\"))\n",
    "    \n",
    "    _ = load_report(\"MaxInitializationTime\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.148812Z",
     "iopub.status.busy": "2020-12-18T18:14:06.145111Z",
     "iopub.status.idle": "2020-12-18T18:14:06.204685Z",
     "shell.execute_reply": "2020-12-18T18:14:06.204268Z"
    },
    "papermill": {
     "duration": 0.101876,
     "end_time": "2020-12-18T18:14:06.204801",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.102925",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### Step duration analysis"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "StepOutlier not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\":\n",
    "    display(Markdown(\"\"\"### Step duration analysis\"\"\"))\n",
    "    report = load_report('StepOutlier')\n",
    "    if report:\n",
    "        parameters = report['RuleParameters']\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        stddev = params[3].split(':')[1]\n",
    "        mode = params[1].split(':')[1]\n",
    "        n_outlier = params[2].split(':')[1]\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "\n",
    "        text = f\"\"\"The StepOutlier rule measures step durations and checks for outliers. The rule \n",
    "        returns True if duration is larger than {stddev} times the standard deviation. The rule \n",
    "        also takes the parameter mode, that specifies whether steps from training or validation phase \n",
    "        should be checked. In your processing job mode was specified as {mode}. \n",
    "        Typically the first step is taking significantly more time and to avoid the \n",
    "        rule triggering immediately, one can use n_outliers to specify the number of outliers to ignore. \n",
    "        n_outliers was set to {n_outlier}.\n",
    "        The rule analysed {datapoints} datapoints and triggered {triggered} times.\n",
    "        \"\"\"\n",
    "\n",
    "        paragraph = Paragraph(text=text, width=900)\n",
    "        show(column(paragraph))\n",
    "\n",
    "        if report and len(report['Details']['step_details']) > 0:\n",
    "            for node_id in report['Details']['step_details']:\n",
    "                tmp = report['RuleParameters'].split('threshold:')\n",
    "                threshold = tmp[1].split('\\n')[0]\n",
    "                n_outliers = report['Details']['step_details'][node_id]['number_of_outliers']\n",
    "                mean = report['Details']['step_details'][node_id]['step_stats']['mean']\n",
    "                stddev = report['Details']['step_details'][node_id]['stddev']\n",
    "                phase = report['Details']['step_details'][node_id]['phase']\n",
    "                display(Markdown(f\"\"\"**Step durations on node {node_id}:**\"\"\"))\n",
    "                display(Markdown(f\"\"\"The following table is a summary of the statistics of step durations measured on node {node_id}.\n",
    "                The rule has analyzed the step duration from {phase} phase.\n",
    "                The average step duration on node {node_id} was {round(mean, 2)}s. \n",
    "                The rule detected {n_outliers} outliers, where step duration was larger than {threshold} times the standard deviation of {stddev}s\n",
    "                                 \\n\"\"\"))\n",
    "                step_stats_df = pd.DataFrame.from_dict(report['Details']['step_details'][node_id]['step_stats'], orient='index').T\n",
    "                step_stats_df.index = ['Step Durations in [s]']\n",
    "                pretty_print(step_stats_df)\n",
    "\n",
    "            display(Markdown(f\"\"\"The following histogram shows the step durations measured on the different nodes. \n",
    "                You can turn on or turn off the visualization of histograms by selecting or unselecting the labels in the legend.\"\"\"))\n",
    "\n",
    "            plot = figure(plot_height=450, \n",
    "                              plot_width=850, \n",
    "                              title=f\"\"\"Step durations\"\"\")  \n",
    "\n",
    "            colors = bokeh.palettes.viridis(len(report['Details']['step_details']))\n",
    "\n",
    "            for index, node_id in enumerate(report['Details']['step_details']):\n",
    "                probs = report['Details']['step_details'][node_id]['probs']\n",
    "                binedges = report['Details']['step_details'][node_id]['binedges']\n",
    "\n",
    "                plot.quad( top=probs,\n",
    "                        bottom=0,\n",
    "                        left=binedges[:-1],\n",
    "                        right=binedges[1:],\n",
    "                        line_color=\"white\",\n",
    "                        fill_color=colors[index],\n",
    "                        fill_alpha=0.7,\n",
    "                        legend=node_id)\n",
    "\n",
    "            plot.add_layout(Legend(), 'right')    \n",
    "            plot.y_range.start = 0\n",
    "            plot.xaxis.axis_label = f\"\"\"Step durations in [s]\"\"\"\n",
    "            plot.yaxis.axis_label = \"Occurrences\"\n",
    "            plot.grid.grid_line_color = \"white\"\n",
    "            plot.legend.click_policy=\"hide\"\n",
    "            plot.legend.location = 'center_right'\n",
    "            show(plot)\n",
    "\n",
    "        if report['RuleTriggered'] > 0:\n",
    "\n",
    "            text=f\"\"\"To get a better understanding of what may have caused those outliers,\n",
    "            we correlate the timestamps of step outliers with other framework metrics that happened at the same time.\n",
    "            The left chart shows how much time was spent in the different framework\n",
    "            metrics aggregated by event phase. The chart on the right shows the histogram of normal step durations (without\n",
    "            outliers). The following chart shows how much time was spent in the different \n",
    "            framework metrics when step outliers occurred. In this chart framework metrics are not aggregated byphase.\"\"\"\n",
    "            plots = []\n",
    "            if 'phase' in report['Details']:\n",
    "                text = f\"\"\"{text} The chart (in the middle) shows whether step outliers mainly happened during TRAIN or EVAL phase.\n",
    "                \"\"\"\n",
    "\n",
    "                plot = create_piechart(report['Details']['phase'], \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"The ratio between the time spent on the TRAIN/EVAL phase\")\n",
    "                plots.append(plot)\n",
    "\n",
    "            if 'forward_backward' in report['Details'] and  len(report['Details']['forward_backward']) > 0:\n",
    "\n",
    "                event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n",
    "                perc = report['Details']['forward_backward'][event]\n",
    "\n",
    "                text = f\"\"\"{text} The pie chart on the right shows a detailed breakdown. \n",
    "                It shows that {int(perc)}% of the training time was spent on event \"{event}\".\"\"\"\n",
    "\n",
    "                plot = create_piechart(report['Details']['forward_backward'], \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"The Ratio between forward and backward pass\") \n",
    "                plots.append(plot)\n",
    "\n",
    "            if len(plots) > 0:\n",
    "                paragraph = Paragraph(text=text, width=900)\n",
    "                show(column(paragraph, row(plots)))\n",
    "\n",
    "            plots = []\n",
    "            text = \"\"\n",
    "            if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n",
    "\n",
    "                key = list(report['Details']['ratio'].keys())[0]\n",
    "                ratio = report['Details']['ratio'][key]\n",
    "\n",
    "                text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators executed during the step outliers. \n",
    "                    It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n",
    "\n",
    "                plot = create_piechart(report['Details']['ratio'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"The ratio between CPU/GPU operators\")\n",
    "                plots.append(plot)\n",
    "\n",
    "\n",
    "            if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n",
    "\n",
    "                event = max(report['Details']['general'], key=report['Details']['general'].get)\n",
    "                perc = report['Details']['general'][event]\n",
    "\n",
    "                plot = create_piechart(report['Details']['general'], \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"General metrics recorded in framework \")\n",
    "                plots.append(plot)\n",
    "\n",
    "            if len(plots) > 0:\n",
    "                paragraph = Paragraph(text=text, width=900)\n",
    "                show(column(paragraph, row(plots)))\n",
    "\n",
    "            plots = []\n",
    "            text = \"\"\n",
    "            if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n",
    "\n",
    "                event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n",
    "                perc = report['Details']['horovod'][event]\n",
    "                text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics that have been\n",
    "                recorded when step outliers happened. The most expensive function was {event} with {int(perc)}%\"\"\"\n",
    "\n",
    "                plot = create_piechart(report['Details']['horovod'], \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"General metrics recorded in framework \")\n",
    "\n",
    "                paragraph = Paragraph(text=text, width=900)\n",
    "                show(column(paragraph, row(plot)))      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.275939Z",
     "iopub.status.busy": "2020-12-18T18:14:06.275322Z",
     "iopub.status.idle": "2020-12-18T18:14:06.294858Z",
     "shell.execute_reply": "2020-12-18T18:14:06.295263Z"
    },
    "papermill": {
     "duration": 0.062956,
     "end_time": "2020-12-18T18:14:06.295414",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.232458",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### GPU utilization analysis\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "**Usage per GPU** \n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LowGPUUtilization not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\":\n",
    "    display(Markdown(\"\"\"### GPU utilization analysis\\n\\n\"\"\"))\n",
    "    display(Markdown(\"\"\"**Usage per GPU** \\n\\n\"\"\"))\n",
    "    report = load_report('LowGPUUtilization')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        threshold_p95 = params[0].split(':')[1]\n",
    "        threshold_p5 = params[1].split(':')[1]\n",
    "        window = params[2].split(':')[1]\n",
    "        patience = params[3].split(':')[1]\n",
    "        violations = report['Violations']\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "        \n",
    "        text=Paragraph(text=f\"\"\"The LowGPUUtilization rule checks for a low and fluctuating GPU usage. If the GPU usage is \n",
    "        consistently low, it might be caused by bottlenecks or a small batch size. If usage is heavily \n",
    "        fluctuating, it can be due to bottlenecks or blocking calls. The rule computed the 95th and 5th \n",
    "        percentile of GPU utilization on {window} continuous datapoints and found {violations} cases where \n",
    "        p95 was above {threshold_p95}% and p5 was below {threshold_p5}%. If p95 is high and p5 is low,\n",
    "        it might indicate that the GPU usage is highly fluctuating. If both values are very low, \n",
    "        it would mean that the machine is underutilized. During initialization, the GPU usage is likely zero, \n",
    "        so the rule skipped the first {patience} data points.\n",
    "        The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\", width=800)\n",
    "        show(text)\n",
    "\n",
    "        \n",
    "        if len(report['Details']) > 0:\n",
    "            \n",
    "            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n",
    "            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "            day = date.date().strftime(\"%m/%d/%Y\")\n",
    "            hour = date.time().strftime(\"%H:%M:%S\")\n",
    "            text = Paragraph(text=f\"\"\"Your training job is underutilizing the instance. You may want to consider\n",
    "            to either switch to a smaller instance type or to increase the batch size. \n",
    "            The last time that the LowGPUUtilization rule was triggered in your training job was on {day} at {hour}.\n",
    "            The following boxplots are a snapshot from the timestamps. \n",
    "            They show the utilization per GPU (without outliers).\n",
    "            To get a better understanding of the workloads throughout the whole training,\n",
    "            you can check the workload histogram in the next section.\"\"\", width=800)\n",
    "            show(text)\n",
    "            \n",
    "            del report['Details']['last_timestamp']\n",
    "            \n",
    "            for node_id in report['Details']:\n",
    "                \n",
    "                plot = figure(plot_height=350, \n",
    "                          plot_width=1000,\n",
    "                          toolbar_location='right',\n",
    "                          tools=\"hover,wheel_zoom,reset,pan\", \n",
    "                          title=f\"Node {node_id}\",\n",
    "                          x_range=(0,17),\n",
    "                          )\n",
    "                \n",
    "                for index, key in enumerate(report['Details'][node_id]):\n",
    "                    display(Markdown(f\"\"\"**GPU utilization of {key} on node {node_id}:**\"\"\"))\n",
    "                    text = \"\"\n",
    "                    gpu_max = report['Details'][node_id][key]['gpu_max']\n",
    "                    p_95 = report['Details'][node_id][key]['gpu_95']\n",
    "                    p_5 = report['Details'][node_id][key]['gpu_5']\n",
    "                    text = f\"\"\"{text} The max utilization of {key} on node {node_id} was {gpu_max}%\"\"\"\n",
    "                    if p_95 < int(threshold_p95): \n",
    "                        text = f\"\"\"{text} and the 95th percentile was only {p_95}%. \n",
    "                        {key} on node {node_id} is underutilized\"\"\"\n",
    "                    if p_5 < int(threshold_p5): \n",
    "                        text = f\"\"\"{text} and the 5th percentile was only {p_5}%\"\"\"\n",
    "                    if p_95 - p_5 > 50:\n",
    "                        text = f\"\"\"{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite \n",
    "                        significant, which means that utilization on {key} is fluctuating quite a lot.\\n\"\"\"\n",
    "     \n",
    "                    upper = report['Details'][node_id][key]['upper']\n",
    "                    lower = report['Details'][node_id][key]['lower']\n",
    "                    p75 = report['Details'][node_id][key]['p75']\n",
    "                    p25 = report['Details'][node_id][key]['p25']\n",
    "                    p50 = report['Details'][node_id][key]['p50']\n",
    "\n",
    "                    plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n",
    "                    plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n",
    "\n",
    "                    plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n",
    "                    plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n",
    "\n",
    "                    plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n",
    "                    plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n",
    "\n",
    "                    plot.xaxis.major_label_overrides[index+1] = key\n",
    "                    plot.xgrid.grid_line_color = None\n",
    "                    plot.ygrid.grid_line_color = \"white\"\n",
    "                    plot.grid.grid_line_width = 0\n",
    "\n",
    "                    plot.xaxis.major_label_text_font_size=\"10px\"\n",
    "                    text=Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n",
    "                    show(text)\n",
    "                plot.yaxis.axis_label = \"Utilization in %\"\n",
    "                plot.xaxis.ticker = np.arange(index+2)\n",
    "                \n",
    "                show(plot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.365060Z",
     "iopub.status.busy": "2020-12-18T18:14:06.363113Z",
     "iopub.status.idle": "2020-12-18T18:14:06.419662Z",
     "shell.execute_reply": "2020-12-18T18:14:06.419246Z"
    },
    "papermill": {
     "duration": 0.095052,
     "end_time": "2020-12-18T18:14:06.419782",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.324730",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "**Workload balancing**\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LoadBalancing not triggered\n"
     ]
    }
   ],
   "source": [
    " \n",
    "if analyse_phase == \"training\": \n",
    "    display(Markdown(\"\"\"**Workload balancing**\\n\\n\"\"\")) \n",
    "    report = load_report('LoadBalancing')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        threshold = params[0].split(':')[1]\n",
    "        patience = params[1].split(':')[1]\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "    \n",
    "        paragraph = Paragraph(text=f\"\"\"The LoadBalancing rule helps to detect issues in workload balancing \n",
    "        between multiple GPUs. \n",
    "        It computes a histogram of GPU utilization values for each GPU and compares then the \n",
    "        similarity between histograms. The rule checked if the distance of histograms is larger than the \n",
    "        threshold of {threshold}.\n",
    "        During initialization utilization is likely zero, so the rule skipped the first {patience} data points.\n",
    "        \"\"\", width=900)\n",
    "        show(paragraph)\n",
    "        \n",
    "        if len(report['Details']) > 0:\n",
    "            for node_id in report['Details']: \n",
    "                \n",
    "                \n",
    "                text = f\"\"\"The following histogram shows the workload per GPU on node {node_id}. \n",
    "                You can enable/disable the visualization of a workload by clicking on the label in the legend.\n",
    "                \"\"\"\n",
    "                if len(report['Details']) == 1 and len(report['Details'][node_id]['workloads']) == 1:\n",
    "                    text = f\"\"\"{text} Your training job only used one GPU so there is no workload balancing issue.\"\"\"\n",
    "                \n",
    "                plot = figure(plot_height=450, \n",
    "                              plot_width=850, \n",
    "                              x_range=(-1,100),\n",
    "                              title=f\"\"\"Workloads on node {node_id}\"\"\")\n",
    "                \n",
    "                colors = bokeh.palettes.viridis(len(report['Details'][node_id]['workloads']))\n",
    "                \n",
    "                for index, gpu_id2 in enumerate(report['Details'][node_id]['workloads']):\n",
    "                    probs = report['Details'][node_id]['workloads'][gpu_id2]\n",
    "                    plot.quad( top=probs,\n",
    "                                bottom=0,\n",
    "                                left=np.arange(0,98,2),\n",
    "                                right=np.arange(2,100,2),\n",
    "                                line_color=\"white\",\n",
    "                                fill_color=colors[index],\n",
    "                                fill_alpha=0.8,\n",
    "                                legend=gpu_id2 )\n",
    "\n",
    "                    plot.y_range.start = 0\n",
    "                    plot.xaxis.axis_label = f\"\"\"Utilization\"\"\"\n",
    "                    plot.yaxis.axis_label = \"Occurrences\"\n",
    "                    plot.grid.grid_line_color = \"white\"\n",
    "                    plot.legend.click_policy=\"hide\"\n",
    "                \n",
    "                paragraph = Paragraph(text=text)\n",
    "                show(column(paragraph, plot))\n",
    "                \n",
    "                if \"distances\" in report['Details'][node_id]:\n",
    "                    text = f\"\"\"The rule identified workload balancing issues on node {node_id} \n",
    "                    where workloads differed by more than threshold {threshold}. \n",
    "                    \"\"\"\n",
    "                    for index, gpu_id2 in enumerate(report['Details'][node_id]['distances']):\n",
    "                        for gpu_id1 in report['Details'][node_id]['distances'][gpu_id2]:\n",
    "                            distance = round(report['Details'][node_id]['distances'][gpu_id2][gpu_id1], 2)\n",
    "                            text = f\"\"\"{text} The difference of workload between {gpu_id2} and {gpu_id1} is: {distance}.\"\"\"\n",
    "\n",
    "                    paragraph = Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n",
    "                    show(column(paragraph))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.493434Z",
     "iopub.status.busy": "2020-12-18T18:14:06.492855Z",
     "iopub.status.idle": "2020-12-18T18:14:06.530672Z",
     "shell.execute_reply": "2020-12-18T18:14:06.531091Z"
    },
    "papermill": {
     "duration": 0.080689,
     "end_time": "2020-12-18T18:14:06.531240",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.450551",
     "status": "completed"
    },
    "scrolled": true,
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### Dataloading analysis\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataloader not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\":\n",
    "    display(Markdown(\"\"\"### Dataloading analysis\\n\\n\"\"\"))\n",
    "    report = load_report('Dataloader')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split(\"\\n\")\n",
    "        min_threshold = params[0].split(':')[1]\n",
    "        max_threshold = params[1].split(':')[1]\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "    \n",
    "        text=f\"\"\"The number of dataloader workers can greatly affect the overall performance \n",
    "        of your training job. The rule analyzed the number of dataloading processes that have been running in \n",
    "        parallel on the training instance and compares it against the total number of cores. \n",
    "        The rule checked if the number of processes is smaller than {min_threshold}% or larger than \n",
    "        {max_threshold}% the total number of cores. Having too few dataloader workers can slowdown data preprocessing and lead to GPU \n",
    "        underutilization. Having too many dataloader workers may hurt the\n",
    "        overall performance if you are running other compute intensive tasks on the CPU.\n",
    "        The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\"\n",
    "        \n",
    "        paragraph = Paragraph(text=f\"{text}\", width=900)\n",
    "        show(paragraph)\n",
    "        text = \"\"\n",
    "        if 'cores' in report['Details']:\n",
    "            cores = int(report['Details']['cores'])\n",
    "            dataloaders = report['Details']['dataloaders']\n",
    "            if dataloaders < cores: \n",
    "                text=f\"\"\"{text} Your training instance provided {cores} CPU cores, however your training job only \n",
    "                ran on average {dataloaders} dataloader workers in parallel. We recommend you to increase the number of\n",
    "                dataloader workers.\"\"\"\n",
    "            if dataloaders > cores:\n",
    "                text=f\"\"\"{text} Your training instance provided {cores} CPU cores, however your training job ran \n",
    "                on average {dataloaders} dataloader workers. We recommed you to decrease the number of dataloader\n",
    "                workers.\"\"\"\n",
    "        if 'pin_memory' in report['Details'] and report['Details']['pin_memory'] == False:\n",
    "            text=f\"\"\"{text} Using pinned memory also improves performance because it enables fast data transfer to CUDA-enabled GPUs.\n",
    "            The rule detected that your training job was not using pinned memory. \n",
    "            In case of using PyTorch Dataloader, you can enable this by setting pin_memory=True.\"\"\"\n",
    "            \n",
    "        if 'prefetch' in report['Details'] and report['Details']['prefetch'] == False:\n",
    "            text=f\"\"\"{text} It appears that your training job did not perform any data pre-fetching. Pre-fetching can improve your\n",
    "            data input pipeline as it produces the data ahead of time.\"\"\"\n",
    "        paragraph = Paragraph(text=f\"{text}\", width=900)\n",
    "        show(paragraph)\n",
    "        \n",
    "        colors=bokeh.palettes.viridis(10)\n",
    "        if \"dataloading_time\" in report['Details']:\n",
    "            median = round(report['Details'][\"dataloading_time\"]['p50'],4)\n",
    "            p95 = round(report['Details'][\"dataloading_time\"]['p95'],4)\n",
    "            p25 = round(report['Details'][\"dataloading_time\"]['p25'],4)\n",
    "            binedges = report['Details'][\"dataloading_time\"]['binedges']\n",
    "            probs = report['Details'][\"dataloading_time\"]['probs']\n",
    "            text=f\"\"\"The following histogram shows the distribution of dataloading times that have been measured throughout your training job. The median dataloading time was {median}s. \n",
    "            The 95th percentile was {p95}s and the 25th percentile was {p25}s\"\"\"\n",
    "\n",
    "            plot = figure(plot_height=450, \n",
    "                              plot_width=850,\n",
    "                              toolbar_location='right',\n",
    "                              tools=\"hover,wheel_zoom,reset,pan\",\n",
    "                              x_range=(binedges[0], binedges[-1])\n",
    "                              )\n",
    "            \n",
    "            plot.quad( top=probs,\n",
    "                        bottom=0,\n",
    "                        left=binedges[:-1],\n",
    "                        right=binedges[1:],\n",
    "                        line_color=\"white\",\n",
    "                        fill_color=colors[0],\n",
    "                        fill_alpha=0.8,\n",
    "                        legend=\"Dataloading events\" )\n",
    "\n",
    "            plot.y_range.start = 0\n",
    "            plot.xaxis.axis_label = f\"\"\"Dataloading in [s]\"\"\"\n",
    "            plot.yaxis.axis_label = \"Occurrences\"\n",
    "            plot.grid.grid_line_color = \"white\"\n",
    "            plot.legend.click_policy=\"hide\"\n",
    "\n",
    "            paragraph = Paragraph(text=f\"{text}\", width=900)\n",
    "            show(column(paragraph, plot))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.609289Z",
     "iopub.status.busy": "2020-12-18T18:14:06.608584Z",
     "iopub.status.idle": "2020-12-18T18:14:06.630407Z",
     "shell.execute_reply": "2020-12-18T18:14:06.629902Z"
    },
    "papermill": {
     "duration": 0.066838,
     "end_time": "2020-12-18T18:14:06.630528",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.563690",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       " ### Batch size"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BatchSize not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\":\n",
    "    display(Markdown(\"\"\" ### Batch size\"\"\"))\n",
    "    report = load_report('BatchSize')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        cpu_threshold_p95 = int(params[0].split(':')[1])\n",
    "        gpu_threshold_p95 = int(params[1].split(':')[1])\n",
    "        gpu_memory_threshold_p95 = int(params[2].split(':')[1])\n",
    "        patience = int(params[3].split(':')[1])\n",
    "        window = int(params[4].split(':')[1])\n",
    "        violations = report['Violations']\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "        \n",
    "        text = Paragraph(text=f\"\"\"The BatchSize rule helps to detect if GPU is underutilized because of the batch size being \n",
    "        too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th percentile of CPU utilization is below cpu_threshold_p95 of \n",
    "        {cpu_threshold_p95}%, the 95th percentile of GPU utilization is below gpu_threshold_p95 of {gpu_threshold_p95}% and the 95th percentile of memory footprint \\\n",
    "        below gpu_memory_threshold_p95 of {gpu_memory_threshold_p95}%. In your training job this happened {violations} times. \\\n",
    "        The rule skipped the first {patience} datapoints. The rule computed the percentiles over window size of {window} continuous datapoints.\\n\n",
    "        The rule analysed {datapoints} datapoints and triggered {triggered} times.\n",
    "        \"\"\", width=800)\n",
    "        show(text)\n",
    "        if len(report['Details']) >0: \n",
    "            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n",
    "            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "            day = date.date().strftime(\"%m/%d/%Y\")\n",
    "            hour = date.time().strftime(\"%H:%M:%S\")\n",
    "            del report['Details']['last_timestamp']\n",
    "            text = Paragraph(text=f\"\"\"Your training job is underutilizing the instance. You may want to consider\n",
    "            either switch to a smaller instance type or to increase the batch size. \n",
    "            The last time the BatchSize rule triggered in your training job was on {day} at {hour}.\n",
    "            The following boxplots are a snapshot from the timestamps. They the total \n",
    "            CPU utilization, the GPU utilization, and the GPU memory usage per GPU (without outliers).\"\"\", \n",
    "            width=800)\n",
    "            show(text)\n",
    "\n",
    "            for node_id in report['Details']:\n",
    "                xmax = max(20, len(report['Details'][node_id]))\n",
    "                \n",
    "                plot = figure(plot_height=350, \n",
    "                          plot_width=1000,\n",
    "                          toolbar_location='right',\n",
    "                          tools=\"hover,wheel_zoom,reset,pan\", \n",
    "                          title=f\"Node {node_id}\",\n",
    "                          x_range=(0,xmax)\n",
    "                          )\n",
    "                \n",
    "                for index, key in enumerate(report['Details'][node_id]):\n",
    "                        upper = report['Details'][node_id][key]['upper']\n",
    "                        lower = report['Details'][node_id][key]['lower']\n",
    "                        p75 = report['Details'][node_id][key]['p75']\n",
    "                        p25 = report['Details'][node_id][key]['p25']\n",
    "                        p50 = report['Details'][node_id][key]['p50']\n",
    "\n",
    "                        plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n",
    "                        plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n",
    "\n",
    "                        plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n",
    "                        plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n",
    "\n",
    "                        plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n",
    "                        plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n",
    "\n",
    "                        plot.xaxis.major_label_overrides[index+1] = key\n",
    "                        plot.xgrid.grid_line_color = None\n",
    "                        plot.ygrid.grid_line_color = \"white\"\n",
    "                        plot.grid.grid_line_width = 0\n",
    "\n",
    "                        plot.xaxis.major_label_text_font_size=\"10px\"\n",
    "                plot.xaxis.ticker = np.arange(index+2)\n",
    "                plot.yaxis.axis_label = \"Utilization in %\"\n",
    "                show(plot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.716097Z",
     "iopub.status.busy": "2020-12-18T18:14:06.712546Z",
     "iopub.status.idle": "2020-12-18T18:14:06.736724Z",
     "shell.execute_reply": "2020-12-18T18:14:06.737106Z"
    },
    "papermill": {
     "duration": 0.073123,
     "end_time": "2020-12-18T18:14:06.737253",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.664130",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### CPU bottlenecks\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPUBottleneck not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\": \n",
    "    display(Markdown(\"\"\"### CPU bottlenecks\\n\\n\"\"\"))\n",
    "\n",
    "    report = load_report('CPUBottleneck')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        threshold = int(params[0].split(':')[1])\n",
    "        cpu_threshold = int(params[1].split(':')[1])\n",
    "        gpu_threshold = int(params[2].split(':')[1])\n",
    "        patience = int(params[3].split(':')[1])\n",
    "        violations = report['Violations']\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "        \n",
    "        if report['Violations'] > 0:\n",
    "            perc = int(report['Violations']/report['Datapoints']*100)\n",
    "        else:\n",
    "            perc = 0\n",
    "        if perc < threshold:\n",
    "            string = 'below'\n",
    "        else:\n",
    "            string = 'above'\n",
    "        text = f\"\"\"The CPUBottleneck rule checked when the CPU utilization was above cpu_threshold of {cpu_threshold}% \n",
    "        and GPU utilization was below gpu_threshold of {gpu_threshold}%. \n",
    "        During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints.\n",
    "        With this configuration the rule found {violations} CPU bottlenecks which is {perc}% of the total time. This is {string} the threshold of {threshold}%\n",
    "        The rule analysed {datapoints} data points and triggered {triggered} times.\"\"\"\n",
    "        \n",
    "        paragraph = Paragraph(text=text, width=900)\n",
    "        show(paragraph)\n",
    "        if report:\n",
    "\n",
    "            plots = []\n",
    "            text = \"\"\n",
    "            if report['RuleTriggered'] > 0:\n",
    "\n",
    "                low_gpu = report['Details']['low_gpu_utilization']\n",
    "                cpu_bottleneck = {}\n",
    "                cpu_bottleneck[\"GPU usage above threshold\"] = report[\"Datapoints\"] - report[\"Details\"][\"low_gpu_utilization\"]\n",
    "                cpu_bottleneck[\"GPU usage below threshold\"] = report[\"Details\"][\"low_gpu_utilization\"] - len(report[\"Details\"])\n",
    "                cpu_bottleneck[\"Low GPU usage due to CPU bottlenecks\"] = len(report[\"Details\"][\"bottlenecks\"])\n",
    "\n",
    "                n_bottlenecks = round(len(report['Details']['bottlenecks'])/datapoints * 100, 2)\n",
    "                text = f\"\"\"The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%\n",
    "                and how many of those datapoints were likely caused by a CPU bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization \n",
    "                below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by CPU bottlenecks. \n",
    "                \"\"\"\n",
    "\n",
    "                plot = create_piechart(cpu_bottleneck, \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"Low GPU usage caused by CPU bottlenecks\")\n",
    "\n",
    "                plots.append(plot)\n",
    "\n",
    "                if 'phase' in report['Details']:\n",
    "                    text = f\"\"\"{text} The chart (in the middle) shows whether CPU bottlenecks mainly \n",
    "                    happened during train/validation phase.\n",
    "                    \"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['phase'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"The ratio between time spent on TRAIN/EVAL phase\")\n",
    "                    plots.append(plot)\n",
    "\n",
    "                if 'forward_backward' in report['Details'] and  len(report['Details']['forward_backward']) > 0:\n",
    "\n",
    "                    event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n",
    "                    perc = report['Details']['forward_backward'][event]\n",
    "\n",
    "                    text = f\"\"\"{text} The pie charts on the right shows a more detailed breakdown. \n",
    "                    It shows that {int(perc)}% of the training time was spent on event {event}\"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['forward_backward'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"The ratio between forward and backward pass\") \n",
    "                    plots.append(plot)\n",
    "\n",
    "                if len(plots) > 0:\n",
    "                    paragraph = Paragraph(text=text, width=900)\n",
    "                    show(column(paragraph, row(plots)))\n",
    "\n",
    "                plots = []\n",
    "                text = \"\"\n",
    "                if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n",
    "\n",
    "                    key = list(report['Details']['ratio'].keys())[0]\n",
    "                    ratio = report['Details']['ratio'][key]\n",
    "\n",
    "                    text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators that happened during CPU bottlenecks. \n",
    "                        It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['ratio'], \n",
    "                                            height=350,\n",
    "                                            width=600,\n",
    "                                            x1=0.2,\n",
    "                                            x2=0.6,\n",
    "                                            radius=0.3, \n",
    "                                            title=\"The ratio between CPU/GPU operators\")\n",
    "                    plots.append(plot)\n",
    "\n",
    "\n",
    "                if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n",
    "\n",
    "                    event = max(report['Details']['general'], key=report['Details']['general'].get)\n",
    "                    perc = report['Details']['general'][event]\n",
    "                \n",
    "                    plot = create_piechart(report['Details']['general'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"General metrics recorded in framework \")\n",
    "                    plots.append(plot)\n",
    "\n",
    "                if len(plots) > 0:\n",
    "                    paragraph = Paragraph(text=text, width=900)\n",
    "                    show(column(paragraph, row(plots)))\n",
    "\n",
    "                plots = []\n",
    "                text = \"\"\n",
    "                if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n",
    "\n",
    "                    event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n",
    "                    perc = report['Details']['horovod'][event]\n",
    "                    text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics \n",
    "                    that have been recorded when the CPU bottleneck happened. The most expensive function was \n",
    "                    {event} with {int(perc)}%\"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['horovod'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"General metrics recorded in framework \")\n",
    "\n",
    "                    paragraph = Paragraph(text=text, width=900)\n",
    "                    show(column(paragraph, row(plot)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.821596Z",
     "iopub.status.busy": "2020-12-18T18:14:06.813130Z",
     "iopub.status.idle": "2020-12-18T18:14:06.846403Z",
     "shell.execute_reply": "2020-12-18T18:14:06.845907Z"
    },
    "papermill": {
     "duration": 0.07505,
     "end_time": "2020-12-18T18:14:06.846523",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.771473",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### I/O bottlenecks\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IOBottleneck not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\": \n",
    "    display(Markdown(\"\"\"### I/O bottlenecks\\n\\n\"\"\"))\n",
    "\n",
    "    report = load_report('IOBottleneck')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        threshold = int(params[0].split(':')[1])\n",
    "        io_threshold = int(params[1].split(':')[1])\n",
    "        gpu_threshold = int(params[2].split(':')[1])\n",
    "        patience = int(params[3].split(':')[1])\n",
    "        violations = report['Violations']\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "    \n",
    "        if report['Violations'] > 0:\n",
    "            perc = int(report['Violations']/report['Datapoints']*100)\n",
    "        else:\n",
    "            perc = 0\n",
    "        if perc < threshold:\n",
    "            string = 'below'\n",
    "        else:\n",
    "            string = 'above'\n",
    "        text = f\"\"\"The IOBottleneck rule checked when I/O wait time was above io_threshold of {io_threshold}% \n",
    "        and GPU utilization was below gpu_threshold of {gpu_threshold}. During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints. \n",
    "        With this configuration the rule found {violations} I/O bottlenecks which is {perc}% of the total time. This is {string} the threshold of {threshold}%.\n",
    "        The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\"\n",
    "        paragraph = Paragraph(text=text, width=900)\n",
    "        show(paragraph)\n",
    "        \n",
    "        if report:\n",
    "\n",
    "            plots = []\n",
    "            text = \"\"\n",
    "            if report['RuleTriggered'] > 0:\n",
    "\n",
    "                low_gpu = report['Details']['low_gpu_utilization']\n",
    "                cpu_bottleneck = {}\n",
    "                cpu_bottleneck[\"GPU usage above threshold\"] = report[\"Datapoints\"] - report[\"Details\"][\"low_gpu_utilization\"]\n",
    "                cpu_bottleneck[\"GPU usage below threshold\"] = report[\"Details\"][\"low_gpu_utilization\"] - len(report[\"Details\"])\n",
    "                cpu_bottleneck[\"Low GPU usage due to I/O bottlenecks\"] = len(report[\"Details\"][\"bottlenecks\"])\n",
    "\n",
    "                n_bottlenecks = round(len(report['Details']['bottlenecks'])/datapoints * 100, 2)\n",
    "                text = f\"\"\"The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%\n",
    "                and how many of those datapoints were likely caused by a I/O bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization \n",
    "                below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by I/O bottlenecks. \n",
    "                \"\"\"\n",
    "\n",
    "                plot = create_piechart(cpu_bottleneck, \n",
    "                                    height=350,\n",
    "                                    width=600,\n",
    "                                    x1=0.2,\n",
    "                                    x2=0.6,\n",
    "                                    radius=0.3, \n",
    "                                    title=\"Low GPU usage caused by I/O bottlenecks\")\n",
    "\n",
    "                plots.append(plot)\n",
    "\n",
    "                if 'phase' in report['Details']:\n",
    "                    text = f\"\"\"{text} The chart (in the middle) shows whether I/O bottlenecks mainly happened during  trianing or validation phase.\n",
    "                    \"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['phase'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"The ratio between the time spent on the TRAIN/EVAL phase\")\n",
    "                    plots.append(plot)\n",
    "\n",
    "                if 'forward_backward' in report['Details'] and  len(report['Details']['forward_backward']) > 0:\n",
    "\n",
    "                    event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n",
    "                    perc = report['Details']['forward_backward'][event]\n",
    "\n",
    "                    text = f\"\"\"{text} The pie charts on the right shows a more detailed breakdown. \n",
    "                    It shows that {int(perc)}% of the training time was spent on event \"{event}\".\"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['forward_backward'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"The ratio between forward and backward pass\") \n",
    "                    plots.append(plot)\n",
    "\n",
    "                if len(plots) > 0:\n",
    "                    paragraph = Paragraph(text=text, width=900)\n",
    "                    show(column(paragraph, row(plots)))\n",
    "\n",
    "                plots = []\n",
    "                text = \"\"\n",
    "                if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n",
    "\n",
    "                    key = list(report['Details']['ratio'].keys())[0]\n",
    "                    ratio = report['Details']['ratio'][key]\n",
    "\n",
    "                    text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators that happened \n",
    "                    during I/O bottlenecks. It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['ratio'], \n",
    "                                            height=350,\n",
    "                                            width=600,\n",
    "                                            x1=0.2,\n",
    "                                            x2=0.6,\n",
    "                                            radius=0.3, \n",
    "                                            title=\"Ratio between CPU/GPU operators\")\n",
    "                    plots.append(plot)\n",
    "\n",
    "\n",
    "                if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n",
    "\n",
    "                    event = max(report['Details']['general'], key=report['Details']['general'].get)\n",
    "                    perc = report['Details']['general'][event]\n",
    "\n",
    "                    plot = create_piechart(report['Details']['general'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"General metrics recorded in framework \")\n",
    "                    plots.append(plot)\n",
    "\n",
    "                if len(plots) > 0:\n",
    "                    paragraph = Paragraph(text=text, width=900)\n",
    "                    show(column(paragraph, row(plots)))\n",
    "\n",
    "                plots = []\n",
    "                text = \"\"\n",
    "                if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n",
    "\n",
    "                    event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n",
    "                    perc = report['Details']['horovod'][event]\n",
    "                    text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics that have been\n",
    "                    recorded when I/O bottleneck happened. The most expensive function was {event} with {int(perc)}%\"\"\"\n",
    "\n",
    "                    plot = create_piechart(report['Details']['horovod'], \n",
    "                                        height=350,\n",
    "                                        width=600,\n",
    "                                        x1=0.2,\n",
    "                                        x2=0.6,\n",
    "                                        radius=0.3, \n",
    "                                        title=\"General metrics recorded in framework \")\n",
    "\n",
    "                    paragraph = Paragraph(text=text, width=900)\n",
    "                    show(column(paragraph, row(plot)))    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-12-18T18:14:06.933012Z",
     "iopub.status.busy": "2020-12-18T18:14:06.932440Z",
     "iopub.status.idle": "2020-12-18T18:14:06.952462Z",
     "shell.execute_reply": "2020-12-18T18:14:06.952836Z"
    },
    "papermill": {
     "duration": 0.069815,
     "end_time": "2020-12-18T18:14:06.952980",
     "exception": false,
     "start_time": "2020-12-18T18:14:06.883165",
     "status": "completed"
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### GPU memory\n",
       "\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPUMemoryIncrease not triggered\n"
     ]
    }
   ],
   "source": [
    "if analyse_phase == \"training\":\n",
    "    display(Markdown(\"\"\"### GPU memory\\n\\n\"\"\"))\n",
    "    \n",
    "    report = load_report('GPUMemoryIncrease')\n",
    "    if report:\n",
    "        params = report['RuleParameters'].split('\\n')\n",
    "        increase = float(params[0].split(':')[1])\n",
    "        patience = params[1].split(':')[1]\n",
    "        window = params[2].split(':')[1]\n",
    "        violations = report['Violations']\n",
    "        triggered = report['RuleTriggered']\n",
    "        datapoints = report['Datapoints']\n",
    "    \n",
    "        text=Paragraph(text=f\"\"\"The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. \n",
    "        The rule checked if the moving average of memory increased by more than {increase}%. \n",
    "        So if the moving average increased for instance from 10% to {11+increase}%, \n",
    "        the rule would have triggered. During initialization utilization  is likely 0, so the rule skipped the first {patience} datapoints.\n",
    "        The moving average was computed on a window size of {window} continuous datapoints. The rule detected {violations} violations\n",
    "        where the moving average between previous and current time window increased by more than {increase}%.\n",
    "        The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\",\n",
    "                       width=900)\n",
    "        show(text)\n",
    "\n",
    "        if len(report['Details']) > 0:\n",
    "            \n",
    "            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n",
    "            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n",
    "            day = date.date().strftime(\"%m/%d/%Y\")\n",
    "            hour = date.time().strftime(\"%H:%M:%S\")\n",
    "            text = Paragraph(text=f\"\"\"Your training job triggered memory spikes. \n",
    "            The last time the GPUMemoryIncrease rule triggered in your training job was on {day} at {hour}.\n",
    "            The following boxplots are a snapshot from the timestamps. They show for each node and GPU the corresponding\n",
    "            memory utilization (without outliers).\"\"\", width=900)\n",
    "            show(text)\n",
    "            \n",
    "            del report['Details']['last_timestamp']\n",
    "            \n",
    "            for node_id in report['Details']:\n",
    "    \n",
    "                plot = figure(plot_height=350, \n",
    "                          plot_width=1000,\n",
    "                          toolbar_location='right',\n",
    "                          tools=\"hover,wheel_zoom,reset,pan\", \n",
    "                          title=f\"Node {node_id}\",\n",
    "                          x_range=(0,17),\n",
    "                          )\n",
    "\n",
    "                for index, key in enumerate(report['Details'][node_id]):\n",
    "                    display(Markdown(f\"\"\"**Memory utilization of {key} on node {node_id}:**\"\"\"))\n",
    "                    text = \"\"\n",
    "                    gpu_max = report['Details'][node_id][key]['gpu_max']\n",
    "                    text = f\"\"\"{text} The max memory utilization of {key} on node {node_id} was {gpu_max}%.\"\"\"\n",
    "                    \n",
    "                    p_95 = int(report['Details'][node_id][key]['p95'])\n",
    "                    p_5 = report['Details'][node_id][key]['p05']\n",
    "                    if p_95 < int(50): \n",
    "                        text = f\"\"\"{text} The 95th percentile was only {p_95}%.\"\"\"\n",
    "                    if p_5 < int(5): \n",
    "                        text = f\"\"\"{text} The 5th percentile was only {p_5}%.\"\"\"\n",
    "                    if p_95 - p_5 > 50:\n",
    "                        text = f\"\"\"{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite \n",
    "                        significant, which means that memory utilization on {key} is fluctuating quite a lot.\"\"\"\n",
    "                        \n",
    "                    text = Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n",
    "                    show(text)\n",
    "                    \n",
    "                    upper = report['Details'][node_id][key]['upper']\n",
    "                    lower = report['Details'][node_id][key]['lower']\n",
    "                    p75 = report['Details'][node_id][key]['p75']\n",
    "                    p25 = report['Details'][node_id][key]['p25']\n",
    "                    p50 = report['Details'][node_id][key]['p50']\n",
    "\n",
    "                    plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n",
    "                    plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n",
    "\n",
    "                    plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n",
    "                    plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n",
    "\n",
    "                    plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n",
    "                    plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n",
    "\n",
    "                    plot.xaxis.major_label_overrides[index+1] = key\n",
    "                    plot.xgrid.grid_line_color = None\n",
    "                    plot.ygrid.grid_line_color = \"white\"\n",
    "                    plot.grid.grid_line_width = 0\n",
    "\n",
    "                    plot.xaxis.major_label_text_font_size=\"10px\"\n",
    "                plot.xaxis.ticker = np.arange(index+2)\n",
    "                plot.yaxis.axis_label = \"Utilization in %\"\n",
    "                show(plot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Tags",
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  },
  "papermill": {
   "duration": 3.931089,
   "end_time": "2020-12-18T18:14:07.395423",
   "environment_variables": {},
   "exception": null,
   "input_path": "/opt/ml/code/profiler_report.ipynb",
   "output_path": "/opt/ml/processing/output/rule/profiler-output/.sagemaker-ignore/out.tmp",
   "parameters": {},
   "start_time": "2020-12-18T18:14:03.464334",
   "version": "2.1.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
